Text Cleaning

Clean out punctuation

# Replace every punctuation character in `text` with a single space.
#
# Arguments:
#   text  Character vector (or something gsub() can coerce to character).
#
# Returns:
#   A character vector of the same length as `text`. Punctuation is swapped
#   for a space rather than deleted, so each punctuation mark leaves exactly
#   one space behind (runs of punctuation become runs of spaces).
clean_punc <- function(text) {
    gsub("[[:punct:]]", " ", text)
}

# Test function: each of the six punctuation marks is replaced by one space,
# so the result is "hello" followed by six spaces
clean_punc("hello?!?!?!")
## [1] "hello      "

Clean out specific words

# Remove the stand-alone words "a" and "the" from `text`.
#
# Each whole-word match (delimited by \\b word boundaries) is replaced with a
# single space, so words that merely contain these letters ("banana",
# "other") are left untouched. The spaces around a removed word are kept,
# which means extra spaces remain where the word used to be.
#
# Arguments:
#   text         Character vector to clean.
#   ignore_case  If TRUE, capitalised forms such as "A" or "The" (e.g. at the
#                start of a sentence) are removed as well. Defaults to FALSE,
#                which matches lowercase "a" and "the" only.
#
# Returns:
#   A character vector of the same length as `text`.
clean_a_the <- function(text, ignore_case = FALSE) {
    patterns <- c("\\ba\\b", "\\bthe\\b")
    for (pattern in patterns) {
        text <- gsub(pattern, " ", text, ignore.case = ignore_case)
    }
    text
}

# Test function: both occurrences of 'a' and the one 'the' are replaced by a
# space; the 'a' inside 'banana' is kept because it is not a whole word
clean_a_the("a banana is a fruit that the child liked")
## [1] "  banana is   fruit that   child liked"

Clean out digits

# Replace digits in `text` with spaces.
#
# The pattern matches a whole run of consecutive digits (0-9), so a number
# such as "123" is replaced by ONE space, not three.
#
# Arguments:
#   text  Character vector (or something gsub() can coerce to character).
#
# Returns:
#   A character vector of the same length as `text`.
clean_digits <- function(text) {
    digit_run <- "[0-9]+"
    gsub(digit_run, " ", text)
}

# Test function: each digit is replaced by a space, which leaves a double
# space between words and a trailing space at the end
clean_digits("This1 is2 a3 test4")
## [1] "This  is  a  test "

Clean out extra spaces

# Collapse runs of two or more spaces in `text` into a single space.
#
# Only the literal space character is matched; tabs and newlines are left
# alone. A single leading or trailing space is not removed.
#
# Arguments:
#   text  Character vector (or something gsub() can coerce to character).
#
# Returns:
#   A character vector of the same length as `text`.
clean_extraspaces <- function(text) {
    collapsed <- gsub(" {2,}", " ", text)
    collapsed
}

# Test function: the two double spaces are each reduced to a single space
clean_extraspaces("This  is a  test")
## [1] "This is a test"

Tokenizing by word in a data frame

Using unlist function

# Load your string of text
mystring <- "This is a test string"

# strsplit() splits the string on runs of whitespace and returns a list;
# unlist() flattens that list into a character vector, which becomes a data
# frame with one word per row under a column labelled 'word'
test_df <- data.frame(word = unlist(strsplit(mystring, "\\s+")))
# Print the tokenized data frame
test_df
word
This
is
a
test
string

Using unnest_tokens function

# Load your string of text
mystring2 <- "This is a second test string"

# Put string in tibble format per textbook: one row, with the whole string
# in a single column named 'text'
# NOTE(review): assumes the package providing tibble() is already attached
# earlier in the document - confirm
test2_df <- tibble(text = mystring2)
# Print the one-row tibble
test2_df
text
This is a second test string
# Unnest tokens to one word per line in data frame: 'word' names the new
# output column and 'text' is the input column being split
# NOTE(review): the output below is lowercased ("this"), so unnest_tokens
# presumably lowercases by default - confirm against the tidytext docs.
# Assumes the packages providing %>% and unnest_tokens() are already attached.
test2_df %>%
    unnest_tokens(word, text)
word
this
is
a
second
test
string

Calculate TTR

Manually using nrow and unique

# Use your tokenized df (test_df, built earlier with strsplit/unlist): one
# row per word occurrence in the column 'word'
test_df
word
This
is
a
test
string
# Count tokens: each row of the tokenized data frame is one word occurrence
nrow(test_df)
## [1] 5
# Count types: the number of distinct words
length(unique(test_df$word))
## [1] 5
# Calculate TTR (type-token ratio) as types / tokens, i.e. the number of
# unique words divided by the total number of words. The ratio is at most 1
# and drops as words repeat; it is exactly 1 here because no word repeats.
test_TTR <- length(unique(test_df$word)) / nrow(test_df)
test_TTR
## [1] 1